###TASK 1
## A)
#reading the data
# Read the raw match and odds data.
games <- readRDS("/users/tarkantemizoz/desktop/some_league_id_matches.rds")
odds <- readRDS("/users/tarkantemizoz/desktop/some_league_id_odd_details.rds")
library(data.table)
# Build the over/under class table: drop columns irrelevant for this task
# (note: := NULL modifies `games` by reference, so matches_class and games
# point at the same data.table).
matches_class <- games[, c("leagueId", "home", "away", "type", "date") := NULL]
# Split "score" (e.g. "2:1") into home ("H") and away ("A") goal columns.
matches_class[, c("H", "A") := tstrsplit(score, ":")]
matches_class[, score := NULL]
str(matches_class)
## Classes 'data.table' and 'data.frame': 3119 obs. of 3 variables:
## $ matchId: chr "KjF6FiA6" "ILVbJgQm" "SGIEDVvJ" "YwL5xFHJ" ...
## $ H : chr "0" "3" "2" "0" ...
## $ A : chr "0" "0" "1" "0" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Convert "H" and "A" from character to numeric.
matches_class[, H := as.numeric(H)]
matches_class[, A := as.numeric(A)]
# Over/under 2.5 label: 0 = under, 1 = over (integer goal totals never equal 2.5).
matches_class[H + A < 2.5, over_under := 0]
matches_class[H + A > 2.5, over_under := 1]
matches_class[, c("H", "A") := NULL]
##creating of feature vector, our aim is having the largest feature vector with most number of instances(games)
##odds and bookmaker selection(no handicap case), 'draw no bet' odds are discarded since there are very few of them to combine with other odds
# Feature columns from 1x2, double-chance and both-teams-to-score odds of the
# six selected bookmakers.
pcabet <- odds[betType == "1x2" | betType == "dc" | betType == "bts"]
pcabet <- pcabet[bookmaker %in% c("youwin", "bwin", "Unibet", "bet365", "Betway", "Pinnacle")]
# Final odds are taken: date is the last sort key, so odd[.N] within each group
# is the latest quote (assumes each oddtype maps to a single betType — TODO confirm).
pcabet <- pcabet[order(matchId, betType, oddtype, bookmaker, date)]
pcabet <- pcabet[, list(final_odd = odd[.N]), by = list(matchId, oddtype, bookmaker)]
pcabet <- dcast(pcabet, matchId ~ bookmaker + oddtype, value.var = "final_odd")
# Over/under odds at total handicap 2.5; 'asian handicap' odds are discarded.
# BUG FIX: filter the 'ou' subset (pcabet_ou), not the whole odds table — the
# original re-subset `odds` and silently dropped the betType=='ou' filter,
# letting asian-handicap 2.5 rows leak in despite the comment above.
pcabet_ou <- odds[betType == "ou"]
pcabet_ou <- pcabet_ou[totalhandicap == "2.5"]
pcabet_ou <- pcabet_ou[bookmaker %in% c("youwin", "bwin", "Unibet", "bet365", "Betway", "Pinnacle")]
pcabet_ou <- pcabet_ou[order(matchId, betType, oddtype, date, totalhandicap, bookmaker)]
pcabet_ou <- pcabet_ou[, list(final_odd = odd[.N]), by = list(matchId, oddtype, totalhandicap, bookmaker)]
pcabet_ou <- dcast(pcabet_ou, matchId ~ bookmaker + oddtype + totalhandicap, value.var = "final_odd")
pcabet <- merge(pcabet, pcabet_ou, by = "matchId")
#odds selection for handicap 1.5 and 3.5, 'asian handicap' odds are discarded
# Over/under odds at handicaps 1.5 and 3.5 ('Pinnacle' lacks these, so it is
# excluded here); 'asian handicap' odds are discarded.
# BUG FIX: subset pcabet_ou, not the full odds table, so the betType=='ou'
# filter on the previous line is actually applied.
pcabet_ou <- odds[betType == "ou"]
pcabet_ou <- pcabet_ou[totalhandicap == "1.5" | totalhandicap == "3.5"]
pcabet_ou <- pcabet_ou[bookmaker %in% c("youwin", "bwin", "Unibet", "bet365", "Betway")]
pcabet_ou <- pcabet_ou[order(matchId, betType, oddtype, date, totalhandicap, bookmaker)]
pcabet_ou <- pcabet_ou[, list(final_odd = odd[.N]), by = list(matchId, oddtype, totalhandicap, bookmaker)]
pcabet_ou <- dcast(pcabet_ou, matchId ~ bookmaker + oddtype + totalhandicap, value.var = "final_odd")
# Attach the over/under class value to our feature matrix.
pcabet <- merge(pcabet, pcabet_ou, by = "matchId")
pcabet <- merge(pcabet, matches_class, by = "matchId")
pcabet <- pcabet[, matchId := NULL]
# We need no NA's in our feature vectors.
pcabet <- pcabet[complete.cases(pcabet)]
pcabet <- pcabet[order(over_under)]
# Class of a game shows whether it ended over or under: 1 = under, 2 = over.
class <- pcabet$over_under + 1
# For visualisation: col = 0 plots white (invisible) points, so class_O shows
# only one class and class_U only the other.
class_O <- replace(class, class == 1, 0)
class_U <- replace(class, class == 2, 0)
pcabet <- pcabet[, over_under := NULL]
#applying PCA
# Fit principal components on the odds features and draw the scree plot.
pca <- princomp(pcabet)
plot(pca)

# Most of the variance is captured by components 1 and 2.
str(pca)
## List of 7
## $ sdev : Named num [1:75] 8.874 4.673 2.045 0.619 0.505 ...
## ..- attr(*, "names")= chr [1:75] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ loadings: loadings [1:75, 1:75] 0.00423 0.04335 0.00647 -0.08487 -0.00716 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:75] "Betway_12" "Betway_1X" "Betway_NO" "Betway_X2" ...
## .. ..$ : chr [1:75] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ center : Named num [1:75] 1.29 1.55 1.92 1.83 1.83 ...
## ..- attr(*, "names")= chr [1:75] "Betway_12" "Betway_1X" "Betway_NO" "Betway_X2" ...
## $ scale : Named num [1:75] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:75] "Betway_12" "Betway_1X" "Betway_NO" "Betway_X2" ...
## $ n.obs : int 975
## $ scores : num [1:975, 1:75] -2.09 6.02 1.99 -3.99 -3.99 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:75] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ call : language princomp(x = pcabet)
## - attr(*, "class")= chr "princomp"
# Variance explained per component; Comp.1 and Comp.2 dominate (see below).
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 8.8737507 4.6726567 2.04520026 0.61886570
## Proportion of Variance 0.7410025 0.2054627 0.03936198 0.00360411
## Cumulative Proportion 0.7410025 0.9464652 0.98582715 0.98943126
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.50530171 0.367147381 0.2940071236 0.2856199358
## Proportion of Variance 0.00240274 0.001268487 0.0008134314 0.0007676836
## Cumulative Proportion 0.99183400 0.993102484 0.9939159151 0.9946835987
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 0.2794423403 0.2409931840 0.2246886325 0.1968759871
## Proportion of Variance 0.0007348347 0.0005465309 0.0004750807 0.0003647461
## Cumulative Proportion 0.9954184334 0.9959649643 0.9964400450 0.9968047912
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 0.1795607427 0.1766432956 0.1668326812 0.1573888154
## Proportion of Variance 0.0003034087 0.0002936294 0.0002619193 0.0002331057
## Cumulative Proportion 0.9971081999 0.9974018293 0.9976637485 0.9978968543
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.1524493365 0.1468596452 0.1313442589 0.1260359736
## Proportion of Variance 0.0002187038 0.0002029599 0.0001623407 0.0001494838
## Cumulative Proportion 0.9981155581 0.9983185179 0.9984808586 0.9986303425
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 0.1220260627 0.1150567441 0.1120252693 0.1066461453
## Proportion of Variance 0.0001401233 0.0001245746 0.0001180966 0.0001070275
## Cumulative Proportion 0.9987704658 0.9988950403 0.9990131369 0.9991201644
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 1.017491e-01 9.826941e-02 8.962061e-02 8.911070e-02
## Proportion of Variance 9.742403e-05 9.087447e-05 7.558246e-05 7.472483e-05
## Cumulative Proportion 9.992176e-01 9.993085e-01 9.993840e-01 9.994588e-01
## Comp.29 Comp.30 Comp.31 Comp.32
## Standard deviation 8.247061e-02 7.857755e-02 7.612454e-02 7.221315e-02
## Proportion of Variance 6.400349e-05 5.810349e-05 5.453239e-05 4.907246e-05
## Cumulative Proportion 9.995228e-01 9.995809e-01 9.996354e-01 9.996845e-01
## Comp.33 Comp.34 Comp.35 Comp.36
## Standard deviation 6.570561e-02 0.0583083827 5.432719e-02 5.201225e-02
## Proportion of Variance 4.062657e-05 0.0000319939 2.777408e-05 2.545755e-05
## Cumulative Proportion 9.997251e-01 0.9997571025 9.997849e-01 9.998103e-01
## Comp.37 Comp.38 Comp.39 Comp.40
## Standard deviation 4.672127e-02 4.203174e-02 4.167827e-02 3.849737e-02
## Proportion of Variance 2.054161e-05 1.662494e-05 1.634649e-05 1.394657e-05
## Cumulative Proportion 9.998309e-01 9.998475e-01 9.998638e-01 9.998778e-01
## Comp.41 Comp.42 Comp.43 Comp.44
## Standard deviation 3.775962e-02 3.509797e-02 3.348165e-02 3.201001e-02
## Proportion of Variance 1.341715e-05 1.159229e-05 1.054919e-05 9.642217e-06
## Cumulative Proportion 9.998912e-01 9.999028e-01 9.999134e-01 9.999230e-01
## Comp.45 Comp.46 Comp.47 Comp.48
## Standard deviation 3.013184e-02 2.618308e-02 2.407234e-02 2.300554e-02
## Proportion of Variance 8.543905e-06 6.451293e-06 5.453080e-06 4.980468e-06
## Cumulative Proportion 9.999315e-01 9.999380e-01 9.999434e-01 9.999484e-01
## Comp.49 Comp.50 Comp.51 Comp.52
## Standard deviation 2.253229e-02 2.166182e-02 2.106845e-02 2.001745e-02
## Proportion of Variance 4.777668e-06 4.415653e-06 4.177058e-06 3.770706e-06
## Cumulative Proportion 9.999532e-01 9.999576e-01 9.999618e-01 9.999656e-01
## Comp.53 Comp.54 Comp.55 Comp.56
## Standard deviation 1.928297e-02 1.817626e-02 1.761943e-02 1.661393e-02
## Proportion of Variance 3.499072e-06 3.108953e-06 2.921385e-06 2.597467e-06
## Cumulative Proportion 9.999691e-01 9.999722e-01 9.999751e-01 9.999777e-01
## Comp.57 Comp.58 Comp.59 Comp.60
## Standard deviation 1.655757e-02 1.585616e-02 1.500785e-02 1.448952e-02
## Proportion of Variance 2.579873e-06 2.365926e-06 2.119542e-06 1.975666e-06
## Cumulative Proportion 9.999803e-01 9.999826e-01 9.999848e-01 9.999867e-01
## Comp.61 Comp.62 Comp.63 Comp.64
## Standard deviation 1.293725e-02 1.238666e-02 1.210863e-02 1.171495e-02
## Proportion of Variance 1.575030e-06 1.443822e-06 1.379733e-06 1.291476e-06
## Cumulative Proportion 9.999883e-01 9.999898e-01 9.999911e-01 9.999924e-01
## Comp.65 Comp.66 Comp.67 Comp.68
## Standard deviation 1.149487e-02 1.073942e-02 1.005693e-02 9.742871e-03
## Proportion of Variance 1.243407e-06 1.085343e-06 9.517795e-07 8.932625e-07
## Cumulative Proportion 9.999937e-01 9.999948e-01 9.999957e-01 9.999966e-01
## Comp.69 Comp.70 Comp.71 Comp.72
## Standard deviation 9.264041e-03 8.253651e-03 7.300189e-03 6.607166e-03
## Proportion of Variance 8.076183e-07 6.410582e-07 5.015029e-07 4.108049e-07
## Cumulative Proportion 9.999974e-01 9.999980e-01 9.999985e-01 9.999990e-01
## Comp.73 Comp.74 Comp.75
## Standard deviation 6.482062e-03 5.967543e-03 5.761043e-03
## Proportion of Variance 3.953953e-07 3.351169e-07 3.123255e-07
## Cumulative Proportion 9.999994e-01 9.999997e-01 1.000000e+00
#components 1 and 2 together cover 95% of the variance
# Covariance matrix, eigenvalues and eigenvectors of the feature matrix.
cov_pca <- cov(pcabet)
# Decompose once instead of calling eigen() twice (the original recomputed the
# full decomposition for $values and again for $vectors).
eig <- eigen(cov_pca)
eigenValues <- eig$values
eigenVectors <- eig$vectors
eigenValues
## [1] 7.882430e+01 2.185614e+01 4.187139e+00 3.833880e-01 2.555920e-01
## [6] 1.349356e-01 8.652894e-02 8.166250e-02 7.816819e-02 5.813734e-02
## [11] 5.053681e-02 3.879995e-02 3.227516e-02 3.123489e-02 2.786172e-02
## [16] 2.479667e-02 2.326466e-02 2.158990e-02 1.726903e-02 1.590138e-02
## [21] 1.490565e-02 1.325165e-02 1.256255e-02 1.138508e-02 1.036350e-02
## [26] 9.666791e-03 8.040100e-03 7.948870e-03 6.808385e-03 6.180771e-03
## [31] 5.800895e-03 5.220094e-03 4.321660e-03 3.403358e-03 2.954473e-03
## [36] 2.708052e-03 2.185118e-03 1.768481e-03 1.738862e-03 1.483569e-03
## [41] 1.427253e-03 1.233132e-03 1.122172e-03 1.025693e-03 9.088598e-04
## [46] 6.862577e-04 5.800726e-04 5.297983e-04 5.082255e-04 4.697161e-04
## [51] 4.443355e-04 4.011096e-04 3.722145e-04 3.307155e-04 3.107629e-04
## [56] 2.763061e-04 2.744345e-04 2.516759e-04 2.254668e-04 2.101618e-04
## [61] 1.675442e-04 1.535869e-04 1.467694e-04 1.373810e-04 1.322676e-04
## [66] 1.154536e-04 1.012457e-04 9.502099e-05 8.591057e-05 6.819270e-05
## [71] 5.334748e-05 4.369946e-05 4.206026e-05 3.564813e-05 3.322369e-05
#we also here see that first two eigenvalues are remarkably higher than the others
# Print the first eigenvector (loadings of the dominant component).
eigenVectors[,1]
## [1] -0.004230263 -0.043353051 -0.006465289 0.084874899 0.007156830
## [6] -0.158339449 0.350356508 0.063334535 -0.167376682 0.382676667
## [11] 0.070073310 -0.004204353 -0.047673261 -0.006757607 0.098098860
## [16] 0.007177202 -0.164535562 0.370164906 0.066412026 -0.003925536
## [21] -0.044757768 -0.006078701 0.091958536 0.006805250 -0.167797436
## [26] 0.384144363 0.070145358 -0.003949494 -0.045001933 -0.006467321
## [31] 0.089821374 0.007281335 -0.154878739 0.325611332 0.058782814
## [36] -0.004049944 -0.040728577 -0.006152799 0.078975670 0.006700338
## [41] -0.152266049 0.316139502 0.061879177 -0.008960901 0.012310937
## [46] -0.009739769 0.012733600 -0.009338109 0.013629589 -0.009131554
## [51] 0.012670381 -0.008465491 0.012187904 -0.008586974 0.011911712
## [56] -0.003056553 -0.021072765 0.032634620 0.005210274 -0.003132525
## [61] -0.024887716 0.039316306 0.005624373 -0.002804729 -0.022672694
## [66] 0.033010703 0.005290901 -0.002867389 -0.019609933 0.028085255
## [71] 0.005257677 -0.002994018 -0.020266360 0.027890840 0.005218055
# Print the second eigenvector (loadings of the second component).
eigenVectors[,2]
## [1] 0.012899531 -0.082322642 0.005068932 -0.018354074 -0.006302099
## [6] -0.306850544 -0.098979380 -0.163848416 -0.327387653 -0.114177691
## [11] -0.180196736 0.013417804 -0.093509779 0.004549756 -0.025704846
## [16] -0.005471153 -0.322811770 -0.107596947 -0.171553145 0.012547399
## [21] -0.088021660 0.004562282 -0.023676775 -0.005553349 -0.333031479
## [26] -0.118576235 -0.177792332 0.012334504 -0.087263921 0.004684986
## [31] -0.021937078 -0.005868219 -0.298192465 -0.077792920 -0.154545800
## [36] 0.012366071 -0.075924760 0.004120727 -0.014684170 -0.005006569
## [41] -0.292991596 -0.067857467 -0.157766045 0.037806435 -0.044741474
## [46] 0.042376185 -0.047694809 0.041302846 -0.050489363 0.039965340
## [51] -0.047737122 0.037274744 -0.045109943 0.036459735 -0.043685054
## [56] 0.012546434 0.091379203 -0.116334900 -0.019736962 0.012743516
## [61] 0.109458532 -0.132700910 -0.020736007 0.012550212 0.100502154
## [66] -0.127107918 -0.019850682 0.012648724 0.087965467 -0.104494548
## [71] -0.019701178 0.012720211 0.087437646 -0.106945827 -0.019351289
# Plot the coordinates of the first two principal components, colour-coded by
# class value: all classes first, then each class in isolation (the zeroed
# entries of class_U / class_O plot white and vanish). After PCA we end up
# with a 2-D representation of the data.
for (colour_vec in list(class, class_U, class_O)) {
  plot(pca$scores[, 1], pca$scores[, 2],
       main = "PCA", xlab = "pca$scores[,1]", ylab = "pca$scores[,2]",
       col = colour_vec)
}

#we see that after PCA we did not achieve any remarkable conclusions; over and under results are overlapping.
## B)
# Pairwise distance matrix, euclidean metric (any NA entries zeroed out).
datadist_eu <- dist(pcabet, method = "euclidean")
datadist_eu[is.na(datadist_eu)] <- 0
# Pairwise distance matrix, manhattan metric.
datadist_m <- dist(pcabet, method = "manhattan")
datadist_m[is.na(datadist_m)] <- 0
# Classical MDS to two dimensions using the euclidean distances.
mds_eu <- cmdscale(datadist_eu)
plot(mds_eu[, 1], mds_eu[, 2], main = "MDS_Euclidean", xlab = "", ylab = "", col = class)

# Classical MDS using the manhattan distances.
mds_m <- cmdscale(datadist_m)
plot(mds_m[, 1], mds_m[, 2], main = "MDS_Manhattan", xlab = "", ylab = "", col = class)

#the shapes we obtain are very similar in both cases, we see that the color coded class values, namely game results are more spread for the manhattan distance case. In addition, Manhattan mapping looks like a mirrored one of the other.
## C)
#we see that PCA and MDS results are very similar, they are like mirrored image of one other. MDS with manhattan distance represents the data in a more spread way. Regarding the representation of the data they all present similar results that we cannot obtain significant information from the odd data for the game outcomes, maybe we should change the odd types we added to create feature vectors. Logically, the two methods are different, but in our example the results in PCA and in MDS with euclidean distance are pretty much equivalent. In PCA, we are given the multidimensional data and we figure out that we don't need many dimensions to conceptualize them. So we reduce the dimensions. In MDS, you are given the matrix of distances between the observations, and we are trying to figure out what the locations of these data in space are. The only common goal of PCA and MDS is to visualize objects. But their inputs are different.
###TASK 2
## Operations on the matches_scores data.
# Drop the columns that are irrelevant for this task: leagueId, home and away
# teams, type and date.
matches <- readRDS("/users/tarkantemizoz/desktop/some_league_id_matches.rds")
matches_scores_1X2 <- matches[, c("leagueId", "home", "away", "type", "date") := NULL]
# Split the score string into separate home ("H") and away ("A") goal columns.
matches_scores_1X2[, c("H", "A") := tstrsplit(score, ":")]
matches_scores_1X2[, score := NULL]
# Check the structure to confirm the effect of the transformations.
str(matches_scores_1X2)
## Classes 'data.table' and 'data.frame': 3119 obs. of 3 variables:
## $ matchId: chr "KjF6FiA6" "ILVbJgQm" "SGIEDVvJ" "YwL5xFHJ" ...
## $ H : chr "0" "3" "2" "0" ...
## $ A : chr "0" "0" "1" "0" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Convert "H" and "A" from character to numeric.
matches_scores_1X2[, H := as.numeric(H)]
matches_scores_1X2[, A := as.numeric(A)]
# Encode the match outcome: 0 = draw, 1 = home win, 2 = away win.
matches_scores_1X2[, score := 0]
matches_scores_1X2[H > A, score := 1]
matches_scores_1X2[H < A, score := 2]
matches_scores_1X2[, c("H", "A") := NULL]
## Odds and bookmaker selection for the home/tie/away (1x2) task.
pcabet_hta <- odds[betType == "1x2" | betType == "dc" | betType == "bts"]
pcabet_hta <- pcabet_hta[bookmaker %in% c("youwin", "bwin", "Unibet", "bet365", "Betway", "Pinnacle")]
# Final odds: date is the last sort key, so odd[.N] per group is the latest quote.
pcabet_hta <- pcabet_hta[order(matchId, betType, oddtype, bookmaker, date)]
pcabet_hta <- pcabet_hta[, list(final_odd = odd[.N]), by = list(matchId, oddtype, bookmaker)]
pcabet_hta <- dcast(pcabet_hta, matchId ~ bookmaker + oddtype, value.var = "final_odd")
# Over/under 2.5 odds. BUG FIX: subset the 'ou' rows (pcabet_ou_hta), not the
# whole odds table — the original re-subset `odds` and discarded the
# betType=='ou' filter, so asian-handicap 2.5 rows leaked in.
pcabet_ou_hta <- odds[betType == "ou"]
pcabet_ou_hta <- pcabet_ou_hta[totalhandicap == "2.5"]
pcabet_ou_hta <- pcabet_ou_hta[bookmaker %in% c("youwin", "bwin", "Unibet", "bet365", "Betway", "Pinnacle")]
pcabet_ou_hta <- pcabet_ou_hta[order(matchId, betType, oddtype, date, totalhandicap, bookmaker)]
pcabet_ou_hta <- pcabet_ou_hta[, list(final_odd = odd[.N]), by = list(matchId, oddtype, totalhandicap, bookmaker)]
pcabet_ou_hta <- dcast(pcabet_ou_hta, matchId ~ bookmaker + oddtype + totalhandicap, value.var = "final_odd")
pcabet_hta <- merge(pcabet_hta, pcabet_ou_hta, by = "matchId")
pcabet_hta <- merge(pcabet_hta, matches_scores_1X2, by = "matchId")
pcabet_hta <- pcabet_hta[, matchId := NULL]
pcabet_hta <- pcabet_hta[complete.cases(pcabet_hta)]
#pcabet_hta=pcabet_hta[order(over_under)]  # leftover from Task 1; no over_under column here
# Class encoding after +1: 1 = tie, 2 = home win, 3 = away win.
class <- pcabet_hta$score + 1
pcabet_hta <- pcabet_hta[, score := NULL]
# Per-class colour vectors: entries set to 0 plot white (invisible), so each
# vector highlights exactly one outcome.
class_H <- replace(class, class == 1, 0)
class_H <- replace(class_H, class == 3, 0)
class_A <- replace(class, class == 1, 0)
class_A <- replace(class_A, class == 2, 0)
class_T <- replace(class, class == 2, 0)
class_T <- replace(class_T, class == 3, 0)
# Fit PCA on the 1x2 feature matrix and draw the scree plot.
pca <- princomp(pcabet_hta)
plot(pca)

# Most of the variance is captured by components 1 and 2.
str(pca)
## List of 7
## $ sdev : Named num [1:55] 8.82 4.456 1.08 0.604 0.369 ...
## ..- attr(*, "names")= chr [1:55] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ loadings: loadings [1:55, 1:55] 0.00402 0.04485 0.00679 -0.08481 -0.00737 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55] "Betway_12" "Betway_1X" "Betway_NO" "Betway_X2" ...
## .. ..$ : chr [1:55] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ center : Named num [1:55] 1.29 1.55 1.92 1.83 1.83 ...
## ..- attr(*, "names")= chr [1:55] "Betway_12" "Betway_1X" "Betway_NO" "Betway_X2" ...
## $ scale : Named num [1:55] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:55] "Betway_12" "Betway_1X" "Betway_NO" "Betway_X2" ...
## $ n.obs : int 1001
## $ scores : num [1:1001, 1:55] -2.25 -40.93 -40.93 5.82 2.96 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:55] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ call : language princomp(x = pcabet_hta)
## - attr(*, "class")= chr "princomp"
# Variance explained per component; Comp.1 and Comp.2 dominate (see below).
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 8.8201852 4.4561242 1.08047587 0.604285123
## Proportion of Variance 0.7784935 0.1987075 0.01168234 0.003654125
## Cumulative Proportion 0.7784935 0.9772010 0.98888330 0.992537421
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.369471529 0.3052036656 0.2866151835 0.2622956467
## Proportion of Variance 0.001366034 0.0009321356 0.0008220495 0.0006884648
## Cumulative Proportion 0.993903456 0.9948355911 0.9956576406 0.9963461054
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 0.2378117066 0.2229029509 0.1822654285 0.1692392068
## Proportion of Variance 0.0005659343 0.0004972001 0.0003324361 0.0002866167
## Cumulative Proportion 0.9969120398 0.9974092399 0.9977416760 0.9980282927
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 0.1642493298 0.139517986 0.1304499385 0.1237016653
## Proportion of Variance 0.0002699646 0.000194787 0.0001702893 0.0001531266
## Cumulative Proportion 0.9982982573 0.998493044 0.9986633336 0.9988164602
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.1182649810 0.1115087398 0.1083641269 9.849426e-02
## Proportion of Variance 0.0001399626 0.0001244278 0.0001175089 9.707815e-05
## Cumulative Proportion 0.9989564228 0.9990808506 0.9991983595 9.992954e-01
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 9.142815e-02 8.848573e-02 7.732399e-02 7.432755e-02
## Proportion of Variance 8.364876e-05 7.835127e-05 5.983126e-05 5.528397e-05
## Cumulative Proportion 9.993791e-01 9.994574e-01 9.995173e-01 9.995726e-01
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 7.161669e-02 6.873546e-02 6.580572e-02 6.016464e-02
## Proportion of Variance 5.132489e-05 4.727824e-05 4.333381e-05 3.622282e-05
## Cumulative Proportion 9.996239e-01 9.996712e-01 9.997145e-01 9.997507e-01
## Comp.29 Comp.30 Comp.31 Comp.32
## Standard deviation 5.690146e-02 5.307631e-02 4.763822e-02 4.623090e-02
## Proportion of Variance 3.240011e-05 2.819039e-05 2.270966e-05 2.138771e-05
## Cumulative Proportion 9.997831e-01 9.998113e-01 9.998340e-01 9.998554e-01
## Comp.33 Comp.34 Comp.35 Comp.36
## Standard deviation 4.321141e-02 0.0409633019 0.038708734 3.579805e-02
## Proportion of Variance 1.868515e-05 0.0000167915 0.000014994 1.282385e-05
## Cumulative Proportion 9.998741e-01 0.9998908771 0.999905871 9.999187e-01
## Comp.37 Comp.38 Comp.39 Comp.40
## Standard deviation 3.492717e-02 3.121806e-02 2.887121e-02 2.658160e-02
## Proportion of Variance 1.220749e-05 9.752398e-06 8.341217e-06 7.070692e-06
## Cumulative Proportion 9.999309e-01 9.999407e-01 9.999490e-01 9.999561e-01
## Comp.41 Comp.42 Comp.43 Comp.44
## Standard deviation 2.431813e-02 2.186717e-02 2.132832e-02 1.982745e-02
## Proportion of Variance 5.917796e-06 4.785032e-06 4.552110e-06 3.933989e-06
## Cumulative Proportion 9.999620e-01 9.999668e-01 9.999713e-01 9.999753e-01
## Comp.45 Comp.46 Comp.47 Comp.48
## Standard deviation 1.936419e-02 1.862204e-02 1.782231e-02 1.681739e-02
## Proportion of Variance 3.752305e-06 3.470196e-06 3.178540e-06 2.830198e-06
## Cumulative Proportion 9.999790e-01 9.999825e-01 9.999857e-01 9.999885e-01
## Comp.49 Comp.50 Comp.51 Comp.52
## Standard deviation 1.570523e-02 1.558921e-02 1.414443e-02 1.254828e-02
## Proportion of Variance 2.468245e-06 2.431912e-06 2.002030e-06 1.575680e-06
## Cumulative Proportion 9.999910e-01 9.999934e-01 9.999954e-01 9.999970e-01
## Comp.53 Comp.54 Comp.55
## Standard deviation 1.196211e-02 1.044146e-02 7.155021e-03
## Proportion of Variance 1.431908e-06 1.090993e-06 5.122966e-07
## Cumulative Proportion 9.999984e-01 9.999995e-01 1.000000e+00
#components 1 and 2 together cover 97.7% of the variance
# Covariance matrix, eigenvalues and eigenvectors of the 1x2 feature matrix.
cov_pca <- cov(pcabet_hta)
# Decompose once instead of calling eigen() twice.
eig <- eigen(cov_pca)
eigenValues <- eig$values
eigenVectors <- eig$vectors
eigenValues
## [1] 7.787346e+01 1.987690e+01 1.168596e+00 3.655257e-01 1.366457e-01
## [6] 9.324243e-02 8.223041e-02 6.886781e-02 5.661096e-02 4.973541e-02
## [11] 3.325391e-02 2.867055e-02 2.700482e-02 1.948473e-02 1.703420e-02
## [16] 1.531740e-02 1.400059e-02 1.244663e-02 1.175453e-02 9.710821e-03
## [21] 8.367466e-03 7.837554e-03 5.984979e-03 5.530109e-03 5.134079e-03
## [26] 4.729288e-03 4.334723e-03 3.623403e-03 3.241014e-03 2.819912e-03
## [31] 2.271670e-03 2.139433e-03 1.869094e-03 1.679670e-03 1.499864e-03
## [36] 1.282782e-03 1.221127e-03 9.755419e-04 8.343800e-04 7.072882e-04
## [41] 5.919629e-04 4.786514e-04 4.553520e-04 3.935207e-04 3.753467e-04
## [46] 3.471271e-04 3.179525e-04 2.831075e-04 2.469009e-04 2.432665e-04
## [51] 2.002650e-04 1.576168e-04 1.432352e-04 1.091331e-04 5.124553e-05
#we also here see that first two eigenvalues are remarkably higher than the others
# Print the first eigenvector (loadings of the dominant component).
eigenVectors[,1]
## [1] 0.004021663 0.044845776 0.006791069 -0.084813003 -0.007373819
## [6] 0.163811233 -0.349944806 -0.060687873 0.173289048 -0.381979428
## [11] -0.067067767 0.003996004 0.049364057 0.007109425 -0.097977531
## [16] -0.007411366 0.170334866 -0.369766664 -0.063640593 0.003742103
## [21] 0.046358262 0.006397327 -0.091845477 -0.007049026 0.173689113
## [26] -0.383279245 -0.067334070 0.003762709 0.046520869 0.006791268
## [31] -0.089782059 -0.007517426 0.160111334 -0.325938699 -0.056479685
## [36] 0.003846048 0.042113956 0.006462125 -0.078980035 -0.006940566
## [41] 0.157608995 -0.316172977 -0.059322282 0.008055773 -0.011204960
## [46] 0.008734483 -0.011531398 0.008363887 -0.012371869 0.008169389
## [51] -0.011431881 0.007603698 -0.011099049 0.007728925 -0.010771647
# Print the second eigenvector (loadings of the second component).
eigenVectors[,2]
## [1] 0.01325458 -0.08531403 0.01155932 -0.02298575 -0.01213099
## [6] -0.32290704 -0.12560457 -0.17271156 -0.34457582 -0.14321402
## [11] -0.18976814 0.01375720 -0.09707266 0.01161956 -0.03123026
## [16] -0.01165159 -0.33973969 -0.13564511 -0.18078908 0.01286516
## [21] -0.09129859 0.01078699 -0.02881100 -0.01139730 -0.35022819
## [26] -0.14791119 -0.18759711 0.01268849 -0.09026793 0.01115431
## [31] -0.02693364 -0.01182055 -0.31288506 -0.10097646 -0.16269228
## [36] 0.01270456 -0.07877874 0.01059564 -0.01883601 -0.01079727
## [41] -0.30843451 -0.08958900 -0.16612164 0.03353856 -0.04141768
## [46] 0.03765834 -0.04373048 0.03656251 -0.04664013 0.03553765
## [51] -0.04399716 0.03319074 -0.04154238 0.03246281 -0.04005872
# 2-D PCA projection coloured by match outcome: first all classes together,
# then one panel per outcome (home / tie / away); zeroed colour entries plot
# white and disappear.
plot(pca$scores[, 1], pca$scores[, 2], main = "PCA",
     xlab = "pca$scores[,1]", ylab = "pca$scores[,2]", col = class)

par(mfrow = c(1, 3))
for (colour_vec in list(class_H, class_T, class_A)) {
  plot(pca$scores[, 1], pca$scores[, 2], main = "PCA",
       xlab = "pca$scores[,1]", ylab = "pca$scores[,2]", col = colour_vec)
}
#from these plots we can conclude that from the 2-D representation we obtain after PCA, we can extract significant information regarding the game outcomes. The shape looks like the one we had for the over-under case, but notice that the class values, which show game results, are somewhat grouped. Home-win results are located mostly on the long left tail of the shape, tie results are spread but mostly in the center, and away-win results are on the short right tail of the shape. Building a classification model, we can predict outcomes for the upcoming games.
###TASK 3
library("jpeg")
library("raster")
## Loading required package: sp
##
## Attaching package: 'raster'
## The following object is masked from 'package:data.table':
##
## shift

# Read the image and check its structure.
img <- readJPEG("/users/tarkantemizoz/desktop/HW2.jpg")
# nativeRaster copy — named argument instead of positional TRUE.
# NOTE(review): img1 is not used anywhere below; kept for backward compatibility.
img1 <- readJPEG("/users/tarkantemizoz/desktop/HW2.jpg", native = TRUE)
str(img)
## num [1:512, 1:512, 1:3] 0.271 0.275 0.278 0.278 0.278 ...
dim(img)
## [1] 512 512 3
# The image has 3 channels (R, G, B), each a 512x512 intensity matrix.
plot(0:512, 0:512, type = "n")
rasterImage(img, 0, 0, 512, 512, interpolate = FALSE)
# Extract each channel; [, , k] selects the whole k-th channel (equivalent to
# the explicit 1:512 ranges, but independent of the hard-coded size).
red <- img[, , 1]
green <- img[, , 2]
blue <- img[, , 3]
par(mfrow = c(1, 3))

# Display each channel; t(apply(x, 2, rev)) rotates the matrix so image()
# draws it in the usual orientation.
image(1:512, 1:512, t(apply(red, 2, rev)))
image(1:512, 1:512, t(apply(green, 2, rev)))
image(1:512, 1:512, t(apply(blue, 2, rev)))

par(mfrow = c(1, 1))
# Scatter of the raw intensity values per channel.
plot(blue, pch = 19, col = "blue", main = "Color Intensity", xlab = "", ylab = "")
points(green, pch = 19, col = "green")
points(red, pch = 19, col = "red")

# Create a noisy image: add uniform noise in [0, 0.1] to every channel.
rednoise <- matrix(runif(512 * 512, 0, 0.1), 512, 512)
greennoise <- matrix(runif(512 * 512, 0, 0.1), 512, 512)
bluenoise <- matrix(runif(512 * 512, 0, 0.1), 512, 512)
red <- red + rednoise
green <- green + greennoise
blue <- blue + bluenoise
img[, , 1] <- red
img[, , 2] <- green
img[, , 3] <- blue
# Channel pixel values must stay in [0, 1]. pmin() clamps in place, keeps the
# array dimensions, and is the idiomatic replacement for ifelse(img > 1, 1, img).
img <- pmin(img, 1)
plot(0:512, 0:512, type = "n")
rasterImage(img, 0, 0, 512, 512, interpolate = FALSE)

# Display the noisy channels side by side.
par(mfrow = c(1, 3))
for (channel in list(red, green, blue)) {
  image(1:512, 1:512, t(apply(channel, 2, rev)))
}

par(mfrow = c(1, 1))
# Scatter of the noisy intensity values per channel.
plot(blue, pch = 19, col = "blue", main = "Color Intensity", xlab = "", ylab = "")
points(green, pch = 19, col = "green")
points(red, pch = 19, col = "red")

# Transform the noisy image to grayscale: sum the three channels and rescale
# so the maximum intensity is 1.
img_gr <- img[, , 1] + img[, , 2] + img[, , 3]
img_gr <- img_gr / max(img_gr)
plot(c(0, 1), c(0, 1), t = "n")
rasterImage(img_gr, 0, 0, 1, 1)

str(img_gr)
## num [1:512, 1:512] 0.31 0.319 0.351 0.312 0.313 ...
# Build the patch data matrix: one row per interior pixel, each row the 3x3
# neighbourhood of grayscale intensities, read top row / middle row / bottom row.
# Sizes are derived from dim(img_gr) instead of the hard-coded 511/260100
# (510*510), so the code generalizes to other image sizes.
n_rows <- nrow(img_gr)
n_cols <- ncol(img_gr)
patches_matrix <- matrix(0, (n_rows - 2) * (n_cols - 2), 9)
i <- 1  # row number of the data matrix
for (n in 2:(n_rows - 1)) {
  for (k in 2:(n_cols - 1)) {
    patches_matrix[i, ] <- c(img_gr[n - 1, (k - 1):(k + 1)],
                             img_gr[n,     (k - 1):(k + 1)],
                             img_gr[n + 1, (k - 1):(k + 1)])
    i <- i + 1
  }
}
# Fit PCA on the 3x3 patch matrix and draw the scree plot.
pca_image <- princomp(patches_matrix)
plot(pca_image)

# Most of the variance is captured by component 1 alone.
str(pca_image)
## List of 7
## $ sdev : Named num [1:9] 0.5774 0.0808 0.0678 0.0385 0.0289 ...
## ..- attr(*, "names")= chr [1:9] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ loadings: loadings [1:9, 1:9] -0.331 -0.335 -0.332 -0.334 -0.338 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:9] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ center : num [1:9] 0.448 0.449 0.449 0.448 0.448 ...
## $ scale : num [1:9] 1 1 1 1 1 1 1 1 1
## $ n.obs : int 260100
## $ scores : num [1:260100, 1:9] 0.412 0.436 0.446 0.411 0.38 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:9] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ call : language princomp(x = patches_matrix)
## - attr(*, "class")= chr "princomp"
# Variance explained per component; Comp.1 alone covers over 95% (see below).
summary(pca_image)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.5774402 0.08081527 0.06776616 0.038484502
## Proportion of Variance 0.9564617 0.01873443 0.01317284 0.004248399
## Cumulative Proportion 0.9564617 0.97519613 0.98836897 0.992617367
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.02887941 0.026581091 0.019367114 0.018945900
## Proportion of Variance 0.00239238 0.002026745 0.001075928 0.001029637
## Cumulative Proportion 0.99500975 0.997036492 0.998112421 0.999142057
## Comp.9
## Standard deviation 0.0172942760
## Proportion of Variance 0.0008579428
## Cumulative Proportion 1.0000000000
#component 1 alone covers 95.65% of the variance, so we can reduce the dimension to 1 and still have over 95% of the variance covered.
# Create 510x510 image matrices from the first three component scores.
# The score rows were generated row-major over the interior pixels (row m of
# the image corresponds to score rows (m-1)*510+1 .. m*510), so a single
# byrow matrix() call replaces each of the original copy loops.
comp1_img <- matrix(pca_image$scores[, 1], 510, 510, byrow = TRUE)
comp2_img <- matrix(pca_image$scores[, 2], 510, 510, byrow = TRUE)
comp3_img <- matrix(pca_image$scores[, 3], 510, 510, byrow = TRUE)
# Min-max rescale to [0, 1] so the score images can be drawn as rasters.
# Factored into a helper: the original repeated the min/max expression (and
# recomputed min() twice) for each of the three images.
scale01 <- function(m) (m - min(m)) / (max(m) - min(m))
plot(c(0, 1), c(0, 1), t = "n")
rasterImage(scale01(comp1_img), 0, 0, 1, 1)

plot(c(0, 1), c(0, 1), t = "n")
rasterImage(scale01(comp2_img), 0, 0, 1, 1)

plot(c(0, 1), c(0, 1), t = "n")
rasterImage(scale01(comp3_img), 0, 0, 1, 1)

#see that component 1 alone gives almost every details of the original grayscale image. Component 2 and component 3 fall short giving much detail but together with component 1 they may represent the whole image.
# Covariance matrix, eigenvalues and eigenvectors of the patch matrix.
cov_pca_image <- cov(patches_matrix)
# Decompose once instead of calling eigen() twice.
eig <- eigen(cov_pca_image)
eigenValues <- eig$values
eigenVectors <- eig$vectors
eigenValues
## [1] 0.3334384757 0.0065311332 0.0045922702 0.0014810626 0.0008340235
## [6] 0.0007065571 0.0003750866 0.0003589485 0.0002990931
#we also here see that first eigenvalue is remarkably higher than the others
# Print the leading three eigenvectors; the first is a near-uniform averaging
# pattern, while the second and third alternate in sign (edge-like patterns).
eigenVectors[,1]
## [1] -0.3313745 -0.3354543 -0.3321136 -0.3337820 -0.3377786 -0.3340776
## [7] -0.3306267 -0.3342382 -0.3304846
eigenVectors[,2]
## [1] 0.4315376098 0.0368702793 -0.3731514178 0.4185903422 0.0002841363
## [6] -0.4181490373 0.3707885074 -0.0367568787 -0.4292693037
eigenVectors[,3]
## [1] -0.3684246215 -0.4218698192 -0.4289941639 0.0345563106 0.0007205884
## [6] -0.0333803005 0.4307922889 0.4226328253 0.3684341973
# Arrange the first three eigenvectors as 3x3 patches. The original filled
# rows 1:3 from elements 1:3, 4:6, 7:9 — exactly a row-major (byrow) fill.
patch_eigenvector1 <- matrix(eigenVectors[, 1], 3, 3, byrow = TRUE)
patch_eigenvector2 <- matrix(eigenVectors[, 2], 3, 3, byrow = TRUE)
patch_eigenvector3 <- matrix(eigenVectors[, 3], 3, 3, byrow = TRUE)
# Plot the eigen-patches, min-max rescaled to [0, 1], without interpolation.
# Helper removes the triplicated rescale expression of the original.
scale01 <- function(m) (m - min(m)) / (max(m) - min(m))
plot(c(0, 1), c(0, 1), t = "n")
rasterImage(scale01(patch_eigenvector1), 0, 0, 1, 1, interpolate = FALSE)

plot(c(0, 1), c(0, 1), t = "n")
rasterImage(scale01(patch_eigenvector2), 0, 0, 1, 1, interpolate = FALSE)

plot(c(0, 1), c(0, 1), t = "n")
rasterImage(scale01(patch_eigenvector3), 0, 0, 1, 1, interpolate = FALSE)

#we see that first eigenvector displays an image of a patch with a clear vision of its center. Second and third components represent more like the edges of an image.